# Location of the concrete data CSV (an absolute, machine-specific path).
data_path <- "/Users/apple/Downloads/Concrete_Data.csv"
# The file was converted from xls to csv beforehand with a bash script, because
# OSX does not have proper support for xls

library(data.table)
library(ggplot2)
library(stats)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
## 
##     dotPlot
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following objects are masked from 'package:mosaic':
## 
##     deltaMethod, logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(readxl)
library(ggpubr)
library(BBmisc)
## 
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse
## The following object is masked from 'package:grid':
## 
##     explode
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following object is masked from 'package:base':
## 
##     isFALSE
#
# 1. Read the dataset into the R environment
#

# Load the CSV. check.names = TRUE rewrites the column headers into
# syntactically valid R names.
data <- fread(
  data_path,
  check.names = TRUE
)
head(data)
##    Cement..component.1..kg.in.a.m.3.mixture.
## 1:                                    540.0 
## 2:                                    540.0 
## 3:                                    332.5 
## 4:                                    332.5 
## 5:                                    198.6 
## 6:                                    266.0 
##    Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## 1:                                                  0.0 
## 2:                                                  0.0 
## 3:                                                142.5 
## 4:                                                142.5 
## 5:                                                132.4 
## 6:                                                114.0 
##    Fly.Ash..component.3..kg.in.a.m.3.mixture.
## 1:                                       0.0 
## 2:                                       0.0 
## 3:                                       0.0 
## 4:                                       0.0 
## 5:                                       0.0 
## 6:                                       0.0 
##    Water...component.4..kg.in.a.m.3.mixture.
## 1:                                    162.0 
## 2:                                    162.0 
## 3:                                    228.0 
## 4:                                    228.0 
## 5:                                    192.0 
## 6:                                    228.0 
##    Superplasticizer..component.5..kg.in.a.m.3.mixture.
## 1:                                                2.5 
## 2:                                                2.5 
## 3:                                                0.0 
## 4:                                                0.0 
## 5:                                                0.0 
## 6:                                                0.0 
##    Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## 1:                                              1040.0 
## 2:                                              1055.0 
## 3:                                               932.0 
## 4:                                               932.0 
## 5:                                               978.4 
## 6:                                               932.0 
##    Fine.Aggregate..component.7..kg.in.a.m.3.mixture. Age..day.
## 1:                                            676.0        28 
## 2:                                            676.0        28 
## 3:                                            594.0       270 
## 4:                                            594.0       365 
## 5:                                            825.5       360 
## 6:                                            670.0        90 
##    Concrete.compressive.strength.MPa..megapascals..
## 1:                                           79.99 
## 2:                                           61.89 
## 3:                                           40.27 
## 4:                                           41.05 
## 5:                                           44.30 
## 6:                                           47.03
#
# 2. Descriptive Analysis
#

# Structure of the data
# str() lists every column with its type and first few values. In the output
# recorded below, all nine columns were read as character, not numeric.

str(data)
## Classes 'data.table' and 'data.frame':   1030 obs. of  9 variables:
##  $ Cement..component.1..kg.in.a.m.3.mixture.            : chr  "540.0 " "540.0 " "332.5 " "332.5 " ...
##  $ Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.: chr  "0.0 " "0.0 " "142.5 " "142.5 " ...
##  $ Fly.Ash..component.3..kg.in.a.m.3.mixture.           : chr  "0.0 " "0.0 " "0.0 " "0.0 " ...
##  $ Water...component.4..kg.in.a.m.3.mixture.            : chr  "162.0 " "162.0 " "228.0 " "228.0 " ...
##  $ Superplasticizer..component.5..kg.in.a.m.3.mixture.  : chr  "2.5 " "2.5 " "0.0 " "0.0 " ...
##  $ Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. : chr  "1040.0 " "1055.0 " "932.0 " "932.0 " ...
##  $ Fine.Aggregate..component.7..kg.in.a.m.3.mixture.    : chr  "676.0 " "676.0 " "594.0 " "594.0 " ...
##  $ Age..day.                                            : chr  "28 " "28 " "270 " "365 " ...
##  $ Concrete.compressive.strength.MPa..megapascals..     : chr  "79.99 " "61.89 " "40.27 " "41.05 " ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Column names
#
# The last column is treated as the target; every other column is an input.

column_names = names(data)
target_column = column_names[length(column_names)]
# A negative index drops the last name. The earlier `1:length(x)-1` parsed as
# `(1:length(x)) - 1`, i.e. 0:(n - 1), and only produced the right columns
# because R silently ignores index 0.
input_column = column_names[-length(column_names)]
column_names
## [1] "Cement..component.1..kg.in.a.m.3.mixture."            
## [2] "Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture."
## [3] "Fly.Ash..component.3..kg.in.a.m.3.mixture."           
## [4] "Water...component.4..kg.in.a.m.3.mixture."            
## [5] "Superplasticizer..component.5..kg.in.a.m.3.mixture."  
## [6] "Coarse.Aggregate...component.6..kg.in.a.m.3.mixture." 
## [7] "Fine.Aggregate..component.7..kg.in.a.m.3.mixture."    
## [8] "Age..day."                                            
## [9] "Concrete.compressive.strength.MPa..megapascals.."
# Target: the last column of the table
target_column
## [1] "Concrete.compressive.strength.MPa..megapascals.."
# Inputs: every column except the last
input_column
## [1] "Cement..component.1..kg.in.a.m.3.mixture."            
## [2] "Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture."
## [3] "Fly.Ash..component.3..kg.in.a.m.3.mixture."           
## [4] "Water...component.4..kg.in.a.m.3.mixture."            
## [5] "Superplasticizer..component.5..kg.in.a.m.3.mixture."  
## [6] "Coarse.Aggregate...component.6..kg.in.a.m.3.mixture." 
## [7] "Fine.Aggregate..component.7..kg.in.a.m.3.mixture."    
## [8] "Age..day."
# Dimensions of the data: number of rows, then number of columns
dim(data)
## [1] 1030    9
# Data Preprocessing

# Convert character columns to numeric

# Coerce a vector to numeric with as.numeric(). Values that cannot be parsed
# as numbers come back as NA (as.numeric() warns when that happens).
convertToNumeric <- function(X) {
  as.numeric(X)
}

# Every column was read as character, so coerce each one to numeric
data <- data[, lapply(.SD, convertToNumeric)]

# Count the missing values (NA) in each column

colSums(is.na(data)) # -> No missing values
##             Cement..component.1..kg.in.a.m.3.mixture. 
##                                                     0 
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 
##                                                     0 
##            Fly.Ash..component.3..kg.in.a.m.3.mixture. 
##                                                     0 
##             Water...component.4..kg.in.a.m.3.mixture. 
##                                                     0 
##   Superplasticizer..component.5..kg.in.a.m.3.mixture. 
##                                                     0 
##  Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 
##                                                     0 
##     Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 
##                                                     0 
##                                             Age..day. 
##                                                     0 
##      Concrete.compressive.strength.MPa..megapascals.. 
##                                                     0
# Summary and histogram of every column, drawn on a 3 x 3 grid

par(mfrow = c(3, 3))
for (col_name in names(data)) {
  print(col_name)
  # The name `X` is kept on purpose: hist() uses the argument expression as
  # the default x-axis label.
  X <- data[[col_name]]
  print(summary(X))
  hist(X, main = col_name)
}
## [1] "Cement..component.1..kg.in.a.m.3.mixture."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   102.0   192.4   272.9   281.2   350.0   540.0
## [1] "Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   22.00   73.89  142.95  359.40
## [1] "Fly.Ash..component.3..kg.in.a.m.3.mixture."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   54.19  118.30  200.10
## [1] "Water...component.4..kg.in.a.m.3.mixture."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   121.8   164.9   185.0   181.6   192.0   247.0
## [1] "Superplasticizer..component.5..kg.in.a.m.3.mixture."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   6.300   6.203  10.200  32.200
## [1] "Coarse.Aggregate...component.6..kg.in.a.m.3.mixture."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   801.0   932.0   968.0   972.9  1029.4  1145.0
## [1] "Fine.Aggregate..component.7..kg.in.a.m.3.mixture."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   594.0   731.0   779.5   773.6   824.0   992.6
## [1] "Age..day."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   28.00   45.66   56.00  365.00
## [1] "Concrete.compressive.strength.MPa..megapascals.."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.33   23.71   34.45   35.82   46.13   82.60

# Back to one plot per page
par(mfrow = c(1, 1))

# Density of the target variable, with rug marks and the mean marked

ggdensity(
  data,
  x = "Concrete.compressive.strength.MPa..megapascals..",
  add = "mean",
  rug = TRUE,
  color = "#0073C2FF",
  fill = "#0073C2FF"
)
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.
## Warning: geom_vline(): Ignoring `data` because `xintercept` was provided.

# One boxplot per input column, titled with the column name

for (col_name in input_column) {
  boxplot(data[[col_name]], main = col_name)
}

# QQ plot of each input column's quantiles against the target's quantiles

for (col_name in input_column) {
  # The second argument is written out in full because qqplot() uses that
  # expression as the default y-axis label.
  qqplot(
    data[[col_name]],
    data$Concrete.compressive.strength.MPa..megapascals..,
    xlab = col_name
  )
}

# Analysis on each column
#
# Each input column gets a density plot and a 30-bin histogram, both with a
# dashed vertical line at the column mean. `a` is reused as the base plot
# object of the current column.

## Column Cement..component.1..kg.in.a.m.3.mixture.

a = ggplot(data, aes(x = Cement..component.1..kg.in.a.m.3.mixture.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Cement..component.1..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Cement..component.1..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

## Column Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.

a = ggplot(data, aes(x = Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

## Column Fly.Ash..component.3..kg.in.a.m.3.mixture.

a = ggplot(data, aes(x = Fly.Ash..component.3..kg.in.a.m.3.mixture.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Fly.Ash..component.3..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Fly.Ash..component.3..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

## Column Water...component.4..kg.in.a.m.3.mixture.

a = ggplot(data, aes(x = Water...component.4..kg.in.a.m.3.mixture.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Water...component.4..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Water...component.4..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

## Column Superplasticizer..component.5..kg.in.a.m.3.mixture.

a = ggplot(data, aes(x = Superplasticizer..component.5..kg.in.a.m.3.mixture.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Superplasticizer..component.5..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Superplasticizer..component.5..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

## Column Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.

a = ggplot(data, aes(x = Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

## Column Fine.Aggregate..component.7..kg.in.a.m.3.mixture.

a = ggplot(data, aes(x = Fine.Aggregate..component.7..kg.in.a.m.3.mixture.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Fine.Aggregate..component.7..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Fine.Aggregate..component.7..kg.in.a.m.3.mixture.)), 
             linetype = "dashed", size = 0.6)

## Column Age..day.

a = ggplot(data, aes(x = Age..day.))

a + geom_density() +
  geom_vline(aes(xintercept = mean(Age..day.)), 
             linetype = "dashed", size = 0.6)

a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(Age..day.)), 
             linetype = "dashed", size = 0.6)

#
# 3. Perform required Diagnostic data analytics on the explored dataset.
#

# Pairwise correlation coefficients of all numeric columns

cor(select_if(data, is.numeric))
##                                                       Cement..component.1..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture.                                            1.00000000
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                               -0.27520026
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                          -0.39747855
## Water...component.4..kg.in.a.m.3.mixture.                                           -0.08150687
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                  0.09241390
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                -0.10936104
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                   -0.22270327
## Age..day.                                                                            0.08194618
## Concrete.compressive.strength.MPa..megapascals..                                     0.49782924
##                                                       Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture.                                                       -0.27520026
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                                            1.00000000
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                                      -0.32358377
## Water...component.4..kg.in.a.m.3.mixture.                                                        0.10734660
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                              0.04338346
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                            -0.28400776
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                               -0.28160129
## Age..day.                                                                                       -0.04424505
## Concrete.compressive.strength.MPa..megapascals..                                                 0.13482625
##                                                       Fly.Ash..component.3..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture.                                           -0.397478547
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                               -0.323583770
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                           1.000000000
## Water...component.4..kg.in.a.m.3.mixture.                                           -0.257057836
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                  0.377399697
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                -0.009979403
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                    0.079094391
## Age..day.                                                                           -0.154371716
## Concrete.compressive.strength.MPa..megapascals..                                    -0.105758502
##                                                       Water...component.4..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture.                                           -0.08150687
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                                0.10734660
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                          -0.25705784
## Water...component.4..kg.in.a.m.3.mixture.                                            1.00000000
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                 -0.65746099
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                -0.18236084
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                   -0.45069081
## Age..day.                                                                            0.27760928
## Concrete.compressive.strength.MPa..megapascals..                                    -0.28960079
##                                                       Superplasticizer..component.5..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture.                                                      0.09241390
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                                          0.04338346
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                                     0.37739970
## Water...component.4..kg.in.a.m.3.mixture.                                                     -0.65746099
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                            1.00000000
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                          -0.26608660
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                              0.22258833
## Age..day.                                                                                     -0.19268924
## Concrete.compressive.strength.MPa..megapascals..                                               0.36602184
##                                                       Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture.                                                     -0.109361038
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                                         -0.284007756
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                                    -0.009979403
## Water...component.4..kg.in.a.m.3.mixture.                                                     -0.182360840
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                           -0.266086598
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                           1.000000000
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                             -0.178496507
## Age..day.                                                                                     -0.003015880
## Concrete.compressive.strength.MPa..megapascals..                                              -0.164934614
##                                                       Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture.                                                   -0.22270327
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                                       -0.28160129
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                                   0.07909439
## Water...component.4..kg.in.a.m.3.mixture.                                                   -0.45069081
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                          0.22258833
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                        -0.17849651
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                            1.00000000
## Age..day.                                                                                   -0.15609400
## Concrete.compressive.strength.MPa..megapascals..                                            -0.16723752
##                                                         Age..day.
## Cement..component.1..kg.in.a.m.3.mixture.              0.08194618
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. -0.04424505
## Fly.Ash..component.3..kg.in.a.m.3.mixture.            -0.15437172
## Water...component.4..kg.in.a.m.3.mixture.              0.27760928
## Superplasticizer..component.5..kg.in.a.m.3.mixture.   -0.19268924
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.  -0.00301588
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.     -0.15609400
## Age..day.                                              1.00000000
## Concrete.compressive.strength.MPa..megapascals..       0.32887300
##                                                       Concrete.compressive.strength.MPa..megapascals..
## Cement..component.1..kg.in.a.m.3.mixture.                                                    0.4978292
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.                                        0.1348262
## Fly.Ash..component.3..kg.in.a.m.3.mixture.                                                  -0.1057585
## Water...component.4..kg.in.a.m.3.mixture.                                                   -0.2896008
## Superplasticizer..component.5..kg.in.a.m.3.mixture.                                          0.3660218
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.                                        -0.1649346
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.                                           -0.1672375
## Age..day.                                                                                    0.3288730
## Concrete.compressive.strength.MPa..megapascals..                                             1.0000000
# Correlation Plot

M <- cor(select_if(data, is.numeric))
# Axis labels are switched off (tl.pos = "n") because the column names are
# too long to fit
corrplot(M, method = "number", tl.pos = "n")

# Statistical Analysis
#
# favstats() is called once per column and prints min, Q1, median, Q3, max,
# mean, sd, n and the number of missing values. summary(data) then prints the
# min / quartiles / mean / max of every column in one table.

favstats(~Cement..component.1..kg.in.a.m.3.mixture., data=data)
##  min      Q1 median  Q3 max     mean       sd    n missing
##  102 192.375  272.9 350 540 281.1664 104.5077 1030       0
favstats(~Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture., data=data)
##  min Q1 median     Q3   max     mean       sd    n missing
##    0  0     22 142.95 359.4 73.89485 86.27934 1030       0
favstats(~Fly.Ash..component.3..kg.in.a.m.3.mixture., data=data)
##  min Q1 median    Q3   max     mean       sd    n missing
##    0  0      0 118.3 200.1 54.18738 63.99596 1030       0
favstats(~Water...component.4..kg.in.a.m.3.mixture., data=data)
##    min    Q1 median  Q3 max     mean       sd    n missing
##  121.8 164.9    185 192 247 181.5649 21.35566 1030       0
favstats(~Superplasticizer..component.5..kg.in.a.m.3.mixture., data=data)
##  min Q1 median   Q3  max     mean       sd    n missing
##    0  0    6.3 10.2 32.2 6.203204 5.973035 1030       0
favstats(~Coarse.Aggregate...component.6..kg.in.a.m.3.mixture., data=data)
##  min  Q1 median     Q3  max     mean       sd    n missing
##  801 932    968 1029.4 1145 972.9189 77.75395 1030       0
favstats(~Fine.Aggregate..component.7..kg.in.a.m.3.mixture., data=data)
##  min     Q1 median  Q3   max     mean      sd    n missing
##  594 730.95  779.5 824 992.6 773.5795 80.1758 1030       0
favstats(~Age..day., data=data)
##  min Q1 median Q3 max     mean       sd    n missing
##    1  7     28 56 365 45.66214 63.16991 1030       0
favstats(~Concrete.compressive.strength.MPa..megapascals.., data=data)
##   min    Q1 median     Q3  max     mean       sd    n missing
##  2.33 23.71 34.445 46.135 82.6 35.81796 16.70574 1030       0
summary(data)
##  Cement..component.1..kg.in.a.m.3.mixture.
##  Min.   :102.0                            
##  1st Qu.:192.4                            
##  Median :272.9                            
##  Mean   :281.2                            
##  3rd Qu.:350.0                            
##  Max.   :540.0                            
##  Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
##  Min.   :  0.00                                       
##  1st Qu.:  0.00                                       
##  Median : 22.00                                       
##  Mean   : 73.89                                       
##  3rd Qu.:142.95                                       
##  Max.   :359.40                                       
##  Fly.Ash..component.3..kg.in.a.m.3.mixture.
##  Min.   :  0.00                            
##  1st Qu.:  0.00                            
##  Median :  0.00                            
##  Mean   : 54.19                            
##  3rd Qu.:118.30                            
##  Max.   :200.10                            
##  Water...component.4..kg.in.a.m.3.mixture.
##  Min.   :121.8                            
##  1st Qu.:164.9                            
##  Median :185.0                            
##  Mean   :181.6                            
##  3rd Qu.:192.0                            
##  Max.   :247.0                            
##  Superplasticizer..component.5..kg.in.a.m.3.mixture.
##  Min.   : 0.000                                     
##  1st Qu.: 0.000                                     
##  Median : 6.300                                     
##  Mean   : 6.203                                     
##  3rd Qu.:10.200                                     
##  Max.   :32.200                                     
##  Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
##  Min.   : 801.0                                      
##  1st Qu.: 932.0                                      
##  Median : 968.0                                      
##  Mean   : 972.9                                      
##  3rd Qu.:1029.4                                      
##  Max.   :1145.0                                      
##  Fine.Aggregate..component.7..kg.in.a.m.3.mixture.   Age..day.     
##  Min.   :594.0                                     Min.   :  1.00  
##  1st Qu.:731.0                                     1st Qu.:  7.00  
##  Median :779.5                                     Median : 28.00  
##  Mean   :773.6                                     Mean   : 45.66  
##  3rd Qu.:824.0                                     3rd Qu.: 56.00  
##  Max.   :992.6                                     Max.   :365.00  
##  Concrete.compressive.strength.MPa..megapascals..
##  Min.   : 2.33                                   
##  1st Qu.:23.71                                   
##  Median :34.45                                   
##  Mean   :35.82                                   
##  3rd Qu.:46.13                                   
##  Max.   :82.60
# Standardise the data with method = "standardize". The whole table is passed
# in, so the target column is rescaled together with the inputs.
# `range` is only used by method = "range"; it is listed here with the other
# arguments exactly as before.

data <- normalize(
  data,
  method = "standardize",
  range = c(0, 1),
  margin = 1L,
  on.constant = "quiet"
)

#
# 4. Check for the assumptions of Regression on the loaded dataset.
#

# Assumption 1 : The regression model is linear in its parameters
#
# Build "target ~ input1+input2+..." from the input column names and fit a
# linear model on it.

input_form <- paste(input_column, collapse = "+")
formula <- as.formula(
  paste("Concrete.compressive.strength.MPa..megapascals.. ~ ", input_form)
)

mod <- lm(formula = formula, data = data)
summary(mod)
## 
## Call:
## lm(formula = formula, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.71518 -0.37728  0.04205  0.39297  2.06192 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                           -8.666e-16  1.940e-02
## Cement..component.1..kg.in.a.m.3.mixture.              7.493e-01  5.311e-02
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.  5.363e-01  5.235e-02
## Fly.Ash..component.3..kg.in.a.m.3.mixture.             3.368e-01  4.821e-02
## Water...component.4..kg.in.a.m.3.mixture.             -1.920e-01  5.137e-02
## Superplasticizer..component.5..kg.in.a.m.3.mixture.    1.042e-01  3.341e-02
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.   8.396e-02  4.373e-02
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.      9.670e-02  5.137e-02
## Age..day.                                              4.319e-01  2.052e-02
##                                                       t value Pr(>|t|)    
## (Intercept)                                             0.000 1.000000    
## Cement..component.1..kg.in.a.m.3.mixture.              14.110  < 2e-16 ***
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.  10.245  < 2e-16 ***
## Fly.Ash..component.3..kg.in.a.m.3.mixture.              6.987 5.07e-12 ***
## Water...component.4..kg.in.a.m.3.mixture.              -3.739 0.000195 ***
## Superplasticizer..component.5..kg.in.a.m.3.mixture.     3.118 0.001870 ** 
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.    1.920 0.055122 .  
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.       1.883 0.060044 .  
## Age..day.                                              21.047  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6225 on 1021 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.6125 
## F-statistic: 204.3 on 8 and 1021 DF,  p-value: < 2.2e-16
# Assumption 2 : The mean of residuals is zero -> Holds
# The value printed below is zero up to floating-point rounding. For a
# least-squares fit that includes an intercept, as this one does, the
# residuals average to zero by construction.

mean(mod$residuals)
## [1] -2.90011e-17
# Assumption 3 : Homoscedasticity of residuals or equal variance -> Holds
# The only check made here is the set of lm diagnostic plots, drawn on a
# 2 x 2 grid.
# NOTE(review): no formal test is run; car::ncvTest(mod) could back up the
# visual check - confirm whether that is wanted.
par(mfrow=c(2,2))
plot(mod)

par(mfrow=c(1,1))
# Assumption  4 : No autocorrelation of residuals -> Holds
# acf() plots the autocorrelation of the residuals taken in row order.
# NOTE(review): this is only informative if the row order of the data is
# meaningful (e.g. ordered in time) - confirm.
acf(mod$residuals)

# Assumption 5 : Input columns and residuals are uncorrelated -> Holds
#
# Only the input columns are tested. The target is left out: it equals the
# fitted values plus the residuals, so it is correlated with the residuals by
# construction and would wrongly look like a violation.
# Least-squares residuals are orthogonal to the regressors of the model, so
# the correlations printed here are zero up to rounding.

# Print the Pearson correlation test between one column and the residuals of
# the fitted model `mod` (looked up in the enclosing environment).
#   X: numeric vector with one value per row used to fit `mod`.
# Returns the value of print(), i.e. the test object, invisibly.
checkCorrelationWithMod = function(X) {
  print(cor.test(X, mod$residuals))
}

# as.list(data)[input_column] selects the input columns by name.
# invisible() keeps the list returned by lapply() from being printed again
# after each test has already been printed inside the function.
invisible(lapply(as.list(data)[input_column], checkCorrelationWithMod))
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -7.5441e-16, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## -2.35294e-17 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -2.8522e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##           cor 
## -8.895645e-17 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -1.8506e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##           cor 
## -5.772011e-17 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 1.1031e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 3.440393e-17 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 2.658e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 8.290149e-17 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 3.1017e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 9.673911e-17 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -6.0088e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##           cor 
## -1.874095e-16 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 2.7321e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 8.521194e-17 
## 
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 25.341, df = 1028, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5809978 0.6563001
## sample estimates:
##      cor 
## 0.620075
## $Cement..component.1..kg.in.a.m.3.mixture.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -7.5441e-16, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## -2.35294e-17 
## 
## 
## $Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -2.8522e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##           cor 
## -8.895645e-17 
## 
## 
## $Fly.Ash..component.3..kg.in.a.m.3.mixture.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -1.8506e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##           cor 
## -5.772011e-17 
## 
## 
## $Water...component.4..kg.in.a.m.3.mixture.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 1.1031e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 3.440393e-17 
## 
## 
## $Superplasticizer..component.5..kg.in.a.m.3.mixture.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 2.658e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 8.290149e-17 
## 
## 
## $Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 3.1017e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 9.673911e-17 
## 
## 
## $Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -6.0088e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##           cor 
## -1.874095e-16 
## 
## 
## $Age..day.
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 2.7321e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06108321  0.06108321
## sample estimates:
##          cor 
## 8.521194e-17 
## 
## 
## $Concrete.compressive.strength.MPa..megapascals..
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 25.341, df = 1028, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5809978 0.6563001
## sample estimates:
##      cor 
## 0.620075
# Assumption 6 : The number of observations must be greater than number of Xs -> Holds
# ncol(data) counts the target column as well, so this comparison is slightly
# stricter than the assumption requires.

ncol(data) < nrow(data)
## [1] TRUE
# Assumption 7 : Variability of X -> Holds

# Compute the sample variance of one column.
#
# X : numeric vector.
#
# Returns the variance as a single number. The value is returned instead of
# printed, so mapping this function over several columns with lapply() shows
# each variance once (in the returned list) rather than twice.
checkVariate <- function(X) {
  var(X)
}

# One variance per column of `data`, returned as a list named by column.
lapply(X = data, FUN = checkVariate)
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## $Cement..component.1..kg.in.a.m.3.mixture.
## [1] 1
## 
## $Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## [1] 1
## 
## $Fly.Ash..component.3..kg.in.a.m.3.mixture.
## [1] 1
## 
## $Water...component.4..kg.in.a.m.3.mixture.
## [1] 1
## 
## $Superplasticizer..component.5..kg.in.a.m.3.mixture.
## [1] 1
## 
## $Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## [1] 1
## 
## $Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
## [1] 1
## 
## $Age..day.
## [1] 1
## 
## $Concrete.compressive.strength.MPa..megapascals..
## [1] 1
# Assumption 8 : Check if X and Y have inverse relationship -> Does not hold for the Age..day. column
# One scatter plot per input column: the column on the x-axis, the target on
# the y-axis.

for (predictor in input_column) {
  plot(
    x = data[[predictor]],
    y = data$Concrete.compressive.strength.MPa..megapascals..,
    xlab = predictor
  )
}

# Assumption 9 : No perfect multicollinearity -> Holds (every VIF is finite),
# but several VIFs exceed 5, so multicollinearity among the components is high

# Variance inflation factor of every predictor in the fitted model.
car::vif(mod)
##             Cement..component.1..kg.in.a.m.3.mixture. 
##                                              7.489003 
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 
##                                              7.277091 
##            Fly.Ash..component.3..kg.in.a.m.3.mixture. 
##                                              6.171400 
##             Water...component.4..kg.in.a.m.3.mixture. 
##                                              7.006340 
##   Superplasticizer..component.5..kg.in.a.m.3.mixture. 
##                                              2.963862 
##  Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 
##                                              5.077042 
##     Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 
##                                              7.006677 
##                                             Age..day. 
##                                              1.118366
# Assumption 10 : Normality of residuals
# The default lm diagnostic panels are drawn on one 2 x 2 page; the Normal Q-Q
# panel is the one that speaks to this assumption.

par(mfrow = c(2, 2))
plot(mod, which = c(1, 2, 3, 5))

# Back to a single plot per page.
par(mfrow = c(1, 1))

#
# 5. Split the whole dataset into training (80%) and testing (20%).
#

# Fix the RNG seed so that the split, and every metric derived from it, is
# the same from one run to the next.
set.seed(42)
n_obs <- nrow(data)
# Sorted row numbers of the 80% training sample, drawn without replacement.
# sample.int() is used because `sample` is masked by an attached package.
dt <- sort(sample.int(n_obs, size = floor(0.8 * n_obs)))
train <- data[dt, ]
# Select the complement with positive indices: data[-dt, ] would return zero
# rows (instead of all rows) if dt were ever empty.
test <- data[setdiff(seq_len(n_obs), dt), ]

#
# 6. Design a predictive model for predicting the target attribute from training data.
#

# Ordinary least-squares fit of the target on every other column of `train`.
model <- lm(
  formula = Concrete.compressive.strength.MPa..megapascals.. ~ .,
  data = train
)
# Residual standard error of the fit.
sigma(model)
## [1] 0.6057782
# Coefficient table: estimate, standard error, t value and p-value per term.
summary(model)$coefficients
##                                                           Estimate Std. Error
## (Intercept)                                           -0.006002755 0.02111245
## Cement..component.1..kg.in.a.m.3.mixture.              0.773844472 0.05739294
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.  0.562334081 0.05722500
## Fly.Ash..component.3..kg.in.a.m.3.mixture.             0.362659830 0.05185148
## Water...component.4..kg.in.a.m.3.mixture.             -0.142887962 0.05445239
## Superplasticizer..component.5..kg.in.a.m.3.mixture.    0.124861170 0.03578007
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.   0.094514514 0.04743451
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.      0.132264387 0.05554560
## Age..day.                                              0.442730251 0.02271578
##                                                         t value     Pr(>|t|)
## (Intercept)                                           -0.284323 7.762351e-01
## Cement..component.1..kg.in.a.m.3.mixture.             13.483269 1.505214e-37
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.  9.826721 1.296304e-21
## Fly.Ash..component.3..kg.in.a.m.3.mixture.             6.994204 5.562499e-12
## Water...component.4..kg.in.a.m.3.mixture.             -2.624090 8.850647e-03
## Superplasticizer..component.5..kg.in.a.m.3.mixture.    3.489685 5.095441e-04
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.   1.992527 4.664646e-02
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.      2.381186 1.748577e-02
## Age..day.                                             19.489986 9.639390e-70
#
# 7. Apply the designed model on test data.
#

# Predict the target for the held-out rows with the model fitted on `train`,
# then print the full summary of that fit.
pred <- predict(model, newdata = test)
summary(model)
## 
## Call:
## lm(formula = Concrete.compressive.strength.MPa..megapascals.. ~ 
##     ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.72646 -0.36095  0.03941  0.38011  1.99629 
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                           -0.006003   0.021112
## Cement..component.1..kg.in.a.m.3.mixture.              0.773844   0.057393
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.  0.562334   0.057225
## Fly.Ash..component.3..kg.in.a.m.3.mixture.             0.362660   0.051851
## Water...component.4..kg.in.a.m.3.mixture.             -0.142888   0.054452
## Superplasticizer..component.5..kg.in.a.m.3.mixture.    0.124861   0.035780
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.   0.094515   0.047435
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.      0.132264   0.055546
## Age..day.                                              0.442730   0.022716
##                                                       t value Pr(>|t|)    
## (Intercept)                                            -0.284  0.77624    
## Cement..component.1..kg.in.a.m.3.mixture.              13.483  < 2e-16 ***
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.   9.827  < 2e-16 ***
## Fly.Ash..component.3..kg.in.a.m.3.mixture.              6.994 5.56e-12 ***
## Water...component.4..kg.in.a.m.3.mixture.              -2.624  0.00885 ** 
## Superplasticizer..component.5..kg.in.a.m.3.mixture.     3.490  0.00051 ***
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.    1.993  0.04665 *  
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.       2.381  0.01749 *  
## Age..day.                                              19.490  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6058 on 815 degrees of freedom
## Multiple R-squared:  0.6371, Adjusted R-squared:  0.6335 
## F-statistic: 178.8 on 8 and 815 DF,  p-value: < 2.2e-16
# Observed and predicted target values for the test rows.
actual <- test[["Concrete.compressive.strength.MPa..megapascals.."]]
predicted <- pred

# Span of the observed target in the test set, for judging the error size.
range(actual)
## [1] -1.854929  2.749476
#
# 8. Evaluate the designed model by means of RMSE or MAE.
#

# Root-mean-squared error on the test set. In the recorded run the test target
# spans about -1.85 to 2.75 and the RMSE is about 0.69, both in the units of
# the target column (which has variance 1 in `data`).
Metrics::rmse(actual = actual, predicted = predicted)
## [1] 0.688342
# Mean absolute error on the test set.
Metrics::mae(actual = actual, predicted = predicted)
## [1] 0.5461329